# Reset the session: remove every object from the current workspace, then
# close all open graphics devices.
# NOTE(review): wiping the workspace from inside a script also deletes objects
# the person running it may still need; consider a fresh R session instead.
rm(list = ls())
graphics.off()
### Load packages
library(tidyverse) # Collection of all the good stuff like dplyr, ggplot2 etc.
library(magrittr) # For extra piping operators (e.g. %<>%)
library(skimr) # For nice data summaries
# The dataset comprises three main tables:
# - listings: detailed listings data showing 96 attributes for each of the
#   listings. Some of the attributes which are intuitively interesting are:
#   price (continuous), longitude (continuous), latitude (continuous),
#   listing_type (categorical), is_superhost (categorical), neighbourhood
#   (categorical) and ratings (continuous), among others.
# - reviews: detailed reviews given by the guests, with 6 attributes. Key
#   attributes include date (datetime), listing_id (discrete), reviewer_id
#   (discrete) and comment (textual).
# - calendar: provides details about bookings for the next year by listing.
#   Four attributes in total: listing_id (discrete), date (datetime),
#   available (categorical) and price (continuous).

# Each table is read straight from its gzipped CSV URL (snapshot 2020-06-26);
# head() then previews the first rows as a quick sanity check.
listings <- read_csv("http://data.insideairbnb.com/denmark/hovedstaden/copenhagen/2020-06-26/data/listings.csv.gz")
listings %>% head()

calendar <- read_csv("http://data.insideairbnb.com/denmark/hovedstaden/copenhagen/2020-06-26/data/calendar.csv.gz")
calendar %>% head()

reviews <- read_csv("http://data.insideairbnb.com/denmark/hovedstaden/copenhagen/2020-06-26/data/reviews.csv.gz")
reviews %>% head()
# The neighbourhood boundaries are published as GeoJSON, so a dedicated
# package is needed to read them; what = "sp" requests a spatial object
# instead of a plain list.
library(geojsonio)
neighbourhoods_geojson <- geojson_read(
  "http://data.insideairbnb.com/denmark/hovedstaden/copenhagen/2020-06-26/visualisations/neighbourhoods.geojson",
  what = "sp"
)
# Turn the price column into numbers: parse_number() extracts the numeric
# value from each price string.
listings <- listings %>%
  mutate(price = parse_number(price))

# Number of listings per host, largest hosts first.
listings %>%
  count(host_id, sort = TRUE)
# Where are they?
# Neighbourhood breakdown of the listings belonging to one specific host.
# NOTE(review): the host id is hard-coded; presumably it is the largest host
# from the count above for this snapshot. Confirm before reusing on other data.
listings %>%
  filter(host_id == 187610263) %>%
  count(neighbourhood_cleansed, sort = TRUE)
# Dummy variable for professional hosts
# Flag hosts as "professional" when they have five or more listings in the
# data. A temporary per-host count column is added and dropped again, so the
# only new column is host_professional.
listings <- listings %>%
  add_count(host_id, name = "host_listing_n") %>%
  mutate(host_professional = host_listing_n >= 5) %>%
  select(-host_listing_n)
# Compare mean review score and mean price between professional and other
# hosts (missing values are ignored).
listings %>%
  group_by(host_professional) %>%
  summarise(
    review = mean(review_scores_rating, na.rm = TRUE),
    price = mean(price, na.rm = TRUE)
  )
# -> Professional hosts charge more…
# Mean review score per neighbourhood, with one column per value of
# host_professional (FALSE / TRUE) for a side-by-side comparison.
# .groups = "drop" makes the result explicitly ungrouped, so no leftover
# grouping is carried into pivot_wider() and summarise() does not print its
# regrouping message.
# NOTE(review): the surrounding notes talk about prices, while this table
# compares review scores; confirm which metric was intended.
listings %>%
  group_by(neighbourhood_cleansed, host_professional) %>%
  summarise(
    review = mean(review_scores_rating, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  pivot_wider(names_from = host_professional, values_from = review)
# -> This is true everywhere, yet in some hoods more than in others
# desc_lenght: number of words in the description (runs of word characters).
# desc_long:   TRUE when that word count has a percent rank above 0.9, i.e.
#              the description is among the longest ones.
listings <- listings %>%
  mutate(
    desc_lenght = str_count(description, "\\w+"),
    desc_long = percent_rank(desc_lenght) > 0.9
  )
# Mean review score for listings with long descriptions versus the rest.
listings %>%
  group_by(desc_long) %>%
  summarise(
    review = mean(review_scores_rating, na.rm = TRUE)
  )
# -> No overall effect
# Mark listings that accommodate ten or more guests as "party places".
listings <- listings %>%
  mutate(party_place = accommodates >= 10)
# Per-neighbourhood overview of the party places: how many there are, their
# mean review score, mean price and mean price per person, sorted by count.
#
# The per-person price is computed row by row BEFORE summarising. Inside
# summarise(), a summary named `price` replaces the original column for all
# later expressions, so computing `price / accommodates` after `price` would
# divide the group's mean price by each listing's capacity instead of
# averaging the true per-person prices.
listings %>%
  filter(party_place) %>%
  mutate(price_pp = price / accommodates) %>%
  group_by(neighbourhood_cleansed) %>%
  summarise(
    n = n(),
    review = mean(review_scores_rating, na.rm = TRUE),
    price = mean(price, na.rm = TRUE),
    price_pp = mean(price_pp, na.rm = TRUE)
  ) %>%
  arrange(desc(n))
# If you are on a tight budget, best go to Amager-Vest.
# Interactive map of all listings: clustered markers whose popups show the
# listing name, host name, price, room type and property type.
library(leaflet)

listings %>%
  leaflet() %>%
  addTiles() %>%
  addMarkers(
    lng = ~longitude,
    lat = ~latitude,
    # Spelled-out FALSE: the shorthand F is an ordinary variable that can be
    # reassigned.
    labelOptions = labelOptions(noHide = FALSE),
    clusterOptions = markerClusterOptions(),
    # The formula is evaluated against the data piped into leaflet(), so the
    # popup text always lines up with the markers instead of depending on the
    # global `listings` object.
    # NOTE(review): listing and host names are inserted into the popup HTML
    # unescaped; consider htmltools::htmlEscape() for this user-supplied text.
    popup = ~paste0(
      "<b> Name: </b>", name,
      "<br/><b> Host Name: </b>", host_name,
      "<br> <b> Price: </b>", price,
      "<br/><b> Room Type: </b>", room_type,
      "<br/><b> Property Type: </b>", property_type
    )
  ) %>%
  # Disabled: a fixed initial view. These coordinates (lng -74.00, lat 40.71)
  # are far from the ~12.6 / ~55.7 seen in the neighbourhood polygons.
  # setView(-74.00, 40.71, zoom = 12) %>%
  addProviderTiles("CartoDB.Positron")
# Flatten the spatial polygons into a plain data frame with broom: one row
# per polygon vertex, with the "neighbourhood" attribute used as the region id.
library(broom)
neighbourhoods_tidy <- tidy(neighbourhoods_geojson, region = "neighbourhood")

# Inspect the columns of the tidied result.
glimpse(neighbourhoods_tidy)
#> Rows: 6,658
#> Columns: 7
#> $ long <dbl> 12.63094, 12.63126, 12.63221, 12.63160, 12.63154, 12.63153, 12.63153, 12.63153, 12.63157, 12.63158, 12.6…
#> $ lat <dbl> 55.67050, 55.67028, 55.66961, 55.66943, 55.66941, 55.66940, 55.66939, 55.66930, 55.66926, 55.66924, 55.6…
#> $ order <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 2…
#> $ hole <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,…
#> $ piece <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
#> $ group <fct> Amager st.1, Amager st.1, Amager st.1, Amager st.1, Amager st.1, Amager st.1, Amager st.1, Amager st.1, …
#> $ id <chr> "Amager st", "Amager st", "Amager st", "Amager st", "Amager st", "Amager st", "Amager st", "Amager st", …
# Plain map of the neighbourhood polygons: one polygon per `group`, drawn on
# a map projection with all axes and background removed.
ggplot(neighbourhoods_tidy, aes(x = long, y = lat, group = group)) +
  geom_polygon() +
  coord_map() +
  theme_void()
# Per-neighbourhood aggregates: number of listings, mean price and mean
# review score (missing values ignored).
neighborhood_agg <- listings %>%
  group_by(neighbourhood_cleansed) %>%
  summarise(
    n = n(),
    price_mean = mean(price, na.rm = TRUE),
    review_mean = mean(review_scores_rating, na.rm = TRUE)
  )

# Attach the aggregates to every polygon vertex so they can be used as fill.
# NOTE(review): the polygon `id` must match `neighbourhood_cleansed` exactly;
# unmatched ids get NA aggregates. Verify that names with non-ASCII letters
# match on both sides.
neighbourhoods_tidy <- neighbourhoods_tidy %>%
  left_join(neighborhood_agg, by = c("id" = "neighbourhood_cleansed"))
# Number of places
# Choropleth: neighbourhoods shaded by their number of listings (`n`).
ggplot(
  neighbourhoods_tidy,
  aes(x = long, y = lat, group = group, fill = n)
) +
  geom_polygon() +
  coord_map() +
  theme_void()
# Prices
# Choropleth: neighbourhoods shaded by their mean listing price.
ggplot(
  neighbourhoods_tidy,
  aes(x = long, y = lat, group = group, fill = price_mean)
) +
  geom_polygon() +
  coord_map() +
  theme_void()
# Review scores
# Choropleth: neighbourhoods shaded by their mean review score.
ggplot(
  neighbourhoods_tidy,
  aes(x = long, y = lat, group = group, fill = review_mean)
) +
  geom_polygon() +
  coord_map() +
  theme_void()